In [7]:
from pyspark.sql.types import StructType
import json
In [1]:
data_path = "wiki_edit_data.json"
#read in the data, sadly without a schema (Spark has to scan the data and infer one)
wiki_edits = sqlCtx.read.json(data_path)
In [14]:
wiki_edits.printSchema()
In [2]:
#original schema
wiki_edits.schema
Out[2]:
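In [ ]:
#a quick look at what lives inside a StructType: each field carries a name,
#a data type and a nullable flag (a sketch, not from the original run; the
#field names just reflect whatever Spark inferred from this data)
for field in wiki_edits.schema.fields:
    print("{}: {} (nullable={})".format(field.name, field.dataType, field.nullable))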
In [3]:
s = wiki_edits.schema.json()
s
Out[3]:
In [4]:
type(wiki_edits.schema.json())
Out[4]:
In [5]:
#schema.json() gave us the schema as a string; to dump it to a file we really
#want it as a Python dict, so we parse the string with json.loads()
In [8]:
with open('wiki_schema.json', 'w') as f:
    json.dump(json.loads(s), f)
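In [ ]:
#equivalent shortcut, as an aside: StructType also has a jsonValue() method
#that returns a plain dict directly, so the json.loads() round-trip above
#isn't strictly needed (same file contents as the cell above)
with open('wiki_schema.json', 'w') as f:
    json.dump(wiki_edits.schema.jsonValue(), f)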
In [15]:
#file is saved! Let's make sure it worked by loading it back in :)
In [9]:
with open('wiki_schema.json', 'r') as f:
    json_in = json.load(f)
In [10]:
#what does the schema look like after loading it back in?
json_in
Out[10]:
In [11]:
#to use it as a schema we need to turn it back into a StructType
schema_in = StructType.fromJson(json_in)
type(schema_in)
Out[11]:
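In [ ]:
#sanity check (a sketch): StructType supports ==, so the reconstructed schema
#should compare equal to the one we started with if the round-trip was lossless
schema_in == wiki_edits.schema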
In [12]:
#read in the data again, this time with the schema!
wiki_edits2 = sqlCtx.read.json(data_path, schema=schema_in)
In [13]:
#now loading is a whole lot faster! But we should check the schema just to make sure the trolls didn't eat anything
wiki_edits2.printSchema()
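In [ ]:
#a rough way to see the speed-up (a sketch, not measured in the original run):
#without a schema Spark has to scan the data to infer types; with an explicit
#schema it can skip that pass entirely
import time

start = time.time()
sqlCtx.read.json(data_path).count()  #schema inferred
print("inferred schema: {:.2f}s".format(time.time() - start))

start = time.time()
sqlCtx.read.json(data_path, schema=schema_in).count()  #schema supplied
print("explicit schema: {:.2f}s".format(time.time() - start))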